Data input

nucs <- read.csv("EH23a.softmasked_nuccomp.csv")
nucs$GC <- rowSums( nucs[, c("C", "c", "G", "g")] )/rowSums( nucs[, c("A", "a", "C", "c", "G", "g", "T", "t")] )
nucs$GC <- nucs$GC * 100
gff <- read.table("../Figure1/EH23a.primary_high_confidence.gff3.gz", sep = "\t", quote = "\"")
#
gff[1:3, 1:8]
##           V1         V2     V3    V4       V5 V6 V7 V8
## 1 EH23a.chr1 annotation remark     1 65878799  .  .  .
## 2 EH23a.chr1    feature   gene 11068    24494  .  +  .
## 3 EH23a.chr1    feature   mRNA 11068    24494  .  +  .
genes <- gff[ gff[, 3] == "gene", ]
gff <- read.table("../Figure1/EH23a.unmasked.fasta.mod.EDTA.intact.gff3", sep = "\t", quote = "\"")

gff$chrn <- sub("^.+chr", "", gff$V1)
gff$chrn[ gff$chrn == "X" ] <- 10
gff$chrn <- as.numeric(gff$chrn)

gff$classification <- unlist(lapply(strsplit(gff[,9], split = ";"), function(x){ grep("Classification=", x, value = TRUE) }))
gff$classification <- sub("^Classification=", "", gff$classification)
gff$classification <- sub("^LTR/Copia", "Ty1", gff$classification)
gff$classification <- sub("^LTR/Gypsy", "Ty3", gff$classification)
#
gff[1:8, c(1:8, 10)]
##           V1   V2                        V3    V4    V5 V6 V7 V8 chrn
## 1 EH23a.chr1 EDTA             repeat_region 48463 58586  .  -  .    1
## 2 EH23a.chr1 EDTA   target_site_duplication 48463 48467  .  -  .    1
## 3 EH23a.chr1 EDTA      long_terminal_repeat 48468 50359  .  -  .    1
## 4 EH23a.chr1 EDTA Copia_LTR_retrotransposon 48468 58581  .  -  .    1
## 5 EH23a.chr1 EDTA      long_terminal_repeat 56690 58581  .  -  .    1
## 6 EH23a.chr1 EDTA   target_site_duplication 58582 58586  .  -  .    1
## 7 EH23a.chr1 EDTA             repeat_region 67125 71966  .  +  .    1
## 8 EH23a.chr1 EDTA   target_site_duplication 67125 67129  .  +  .    1
# Ty3
Ty3 <- gff[ gff$classification == "Ty3", ]
Ty3 <- gff[ gff$V3 == "Gypsy_LTR_retrotransposon", ]
Ty3[1:8, c(1:8, 10:11)]
##             V1   V2                        V3      V4      V5 V6 V7 V8 chrn
## 81  EH23a.chr1 EDTA Gypsy_LTR_retrotransposon 2110985 2122540  .  ?  .    1
## 87  EH23a.chr1 EDTA Gypsy_LTR_retrotransposon 2180122 2191426  .  +  .    1
## 161 EH23a.chr1 EDTA Gypsy_LTR_retrotransposon 3614107 3626646  .  -  .    1
## 206 EH23a.chr1 EDTA Gypsy_LTR_retrotransposon 3905221 3916714  .  ?  .    1
## 228 EH23a.chr1 EDTA Gypsy_LTR_retrotransposon 4168059 4180534  .  -  .    1
## 242 EH23a.chr1 EDTA Gypsy_LTR_retrotransposon 4447228 4458755  .  -  .    1
## 283 EH23a.chr1 EDTA Gypsy_LTR_retrotransposon 4885633 4891899  .  -  .    1
## 305 EH23a.chr1 EDTA Gypsy_LTR_retrotransposon 5232888 5239051  .  +  .    1
##     classification
## 81             Ty3
## 87             Ty3
## 161            Ty3
## 206            Ty3
## 228            Ty3
## 242            Ty3
## 283            Ty3
## 305            Ty3
#table(gff$classification, gff$V1)
tetbl <- table(gff$V1, gff$classification)
tetbl <- rbind(tetbl, colMeans(tetbl))
#tetbl

Ty3$motif <- sub("^motif=", "", unlist(lapply(strsplit(Ty3[, 9], split = ";"), function(x){ grep("motif", x, value = TRUE) })))

my_tbl <- table(Ty3$motif, Ty3$chrn)
#my_tbl[, 1:10]
my_mat <- as.matrix(my_tbl)
#my_mat[, 1:10]

my_mat2 <- my_mat[apply(my_mat, MARGIN = 1, function(x){ sum( x != 0 ) }) == 1, ]

my_mat2 <- my_mat2[sort.int(as.numeric(apply(my_mat2, MARGIN = 1, function(x){ paste(x, collapse = "") })), 
         index.return = T)$ix, ]
my_mat2
##       
##        1 2 3 4 5 6 7 8 9 10
##   TTTA 0 0 0 0 0 0 0 0 1  0
##   TGAG 0 0 0 0 0 0 0 1 0  0
##   TATC 0 0 0 0 0 0 1 0 0  0
##   TATT 0 0 0 0 0 0 1 0 0  0
##   TTCC 0 0 0 0 0 0 1 0 0  0
##   TGTG 0 0 0 0 0 1 0 0 0  0
##   TCAC 0 0 0 0 1 0 0 0 0  0
##   TAGC 0 0 0 0 2 0 0 0 0  0
##   TACC 0 0 0 1 0 0 0 0 0  0
##   TTAG 0 0 0 1 0 0 0 0 0  0
##   TTGC 0 0 0 1 0 0 0 0 0  0
##   TAAT 0 0 1 0 0 0 0 0 0  0
##   TCCA 0 0 1 0 0 0 0 0 0  0
##   TGCT 0 0 1 0 0 0 0 0 0  0
##   TAAC 0 1 0 0 0 0 0 0 0  0
##   TCGT 0 1 0 0 0 0 0 0 0  0
##   TCTG 0 1 0 0 0 0 0 0 0  0
##   TACT 1 0 0 0 0 0 0 0 0  0
##   TGTC 1 0 0 0 0 0 0 0 0  0
##   TTCA 1 0 0 0 0 0 0 0 0  0
#wins <- read.csv("EH23a.softmasked_wins1e5.csv")
#
wins <- read.csv("EH23a.softmasked_wins1e6.csv")
#wins <- read.csv("EH23a.softmasked_wins1e7.csv")
wins$CGs <- 0
for( i in unique(wins$Id) ){
  wins$CGs[ wins$Id == i] <- wins$CG[ wins$Id == i] - min(wins$CG[ wins$Id == i], na.rm = TRUE)
  wins$CGs[ wins$Id == i] <- wins$CGs[ wins$Id == i]/max(wins$CGs[ wins$Id == i], na.rm = TRUE)
}
wins$chrn <- sub(".+chr", "", wins$Id)
wins$chrn[ wins$chrn == "X" ] <- 10
wins$chrn <- as.numeric(wins$chrn)
wins$ATs <- 1 - wins$CGs
#wins[1:3, ]

cwins <- wins[wins$CGs == 1 & !is.na(wins$CGs) , ]
cwins <- cwins[ !duplicated( cwins$Id ), ]
cwins$cent <- cwins$Start + cwins$Win_length/2

Variant data

vars <- read.table("ERBxHO40vars.csv.gz", header = TRUE, sep = ",")
nrow(vars)
## [1] 450759
vars <- vars[vars$Missing == 0, ]
nrow(vars)
## [1] 35186
vars[1:3, ]
##      CHROM    POS   n Allele_counts        He       Ne REFhom HET ALThom
## 48  chr_1e  62350 270        509,31 0.1082236 1.121357    240  29      1
## 102 chr_1e  98473 270       412,128 0.3617010 1.566664    145 122      3
## 220 chr_1e 204226 270       263,277 0.4996639 1.998657     65 133     72
##     Missing
## 48        0
## 102       0
## 220       0
vars$ERBallele <- as.numeric(unlist(lapply(strsplit(vars$Allele_counts, split = ","), function(x){x[1]})))
vars$HO40allele <- as.numeric(unlist(lapply(strsplit(vars$Allele_counts, split = ","), function(x){x[2]})))

vars$ERBallele <- vars$ERBallele / (vars$n * 2)
vars$HO40allele <- vars$HO40allele / (vars$n * 2)


my_chrom <- "chr_1e"
plot( vars$POS[ vars$CHROM == my_chrom ], vars$ERBallele[ vars$CHROM == my_chrom ],
      pch = 20, col = "#C0C0C066", ylim = c(0, 1))

points( vars$POS[ vars$CHROM == my_chrom ], vars$HO40allele[ vars$CHROM == my_chrom ],
      pch = 20, col = "#4682B4")


vars$chrn <- sub("chr_", "", vars$CHROM)
vars$chrn <- sub("e$", "", vars$chrn)
vars$chrn[ vars$chrn == "X" ] <- 10
vars$chrn <- as.numeric(vars$chrn)
table(vars$chrn)
## 
##    1    2    3    4    5    6    7    8    9   10 
## 2933 3228 4869 5716 3184 2886 2317 1975 4690 3388
library(ggplot2)

ggplot(vars, aes(x=chrn, y=POS, group=CHROM)) +
  geom_point()

my_cols1 <- paste(RColorBrewer::brewer.pal( n = 12, name = "Set3"), "08", sep="")
#my_cols1 <- paste(RColorBrewer::brewer.pal( n = 12, name = "Paired"), "08", sep="")

hist(vars$ERBallele[vars$chrn == 2], xlim = c(0, 1))

hist(vars$HO40allele[vars$chrn == 2], xlim = c(0, 1))

p <- ggplot(vars, aes(x=chrn+ERBallele-0.5, y=POS, group=CHROM)) +
  geom_point( aes( color = CHROM ) )
p <- p + scale_color_manual(values=my_cols1)
#p <- p + geom_point( data = vars, aes(x = chrn-HO40allele, y=POS), color = vars$chrn)
#
p <- p + geom_point( data = vars, aes(x = chrn+HO40allele-0.5, y=POS), color = my_cols1[vars$chrn])
#p <- p + geom_point( data = vars, aes(x = chrn-0, y=POS), color = my_cols1[vars$chrn])
#p <- p + scale_color_manual(values=my_cols1)
p <- p + theme_bw()
p <- p + theme(legend.position='none')
p <- p + scale_x_continuous(breaks=seq(1, 10, 1))
p <- p + xlab("Chromosome")
p

#p + scale_color_brewer(palette="Dark2")
#p + scale_color_brewer(palette="Set3")
#  geom_point( aes( color = CHROM, alpha = 1/10 ) )
#  geom_point( aes( alpha = 1/10 ) )

Windowize

# wins[1:3, ]
# genes[1:3, 1:8]

wins$gcnt <- 0
wins$ty1cnt <- 0
wins$ty3cnt <- 0
for(i in 1:nrow(wins)){
   tmp <- genes[genes$V1 == wins$Id[i] & genes$V4 >= wins$Start[i] & genes$V5 < wins$End[i], ]
   wins$gcnt[i] <- nrow(tmp)
   
   tmp <- gff[gff$V1 == wins$Id[i] & gff$V4 >= wins$Start[i] & gff$V5 < wins$End[i], ]
   wins$ty1cnt[i] <- sum(tmp$classification == "Ty1")
   wins$ty3cnt[i] <- sum(tmp$classification == "Ty3")
}

wins$gcntsc <- wins$gcnt - min(wins$gcnt)
wins$gcntsc <- wins$gcntsc / max(wins$gcntsc)
#range(round( wins$gcntsc * 100 ))
my_index <- round( wins$gcntsc * 100 )
#my_index <- 100 - my_index
my_index[ my_index <= 0] <- 1

#wins$gcol <- heat.colors(n=100)[ my_index ]
#wins$gcol <- colorRampPalette(c("yellow", "orange", "red"))( 100 )[ my_index ]
#
wins$gcol <- colorRampPalette(c("red", "orange", "yellow"))( 100 )[ my_index ]
#wins$gcol <- colorRampPalette(c("#0000FF", "#228B22", "#A0522D"))( 100 )[ my_index ]
#wins$gcol <- colorRampPalette(c("#87CEEB", "#3CB371", "#228B22", "#A0522D"))( 100 )[ my_index ]


wins$chr <- sub("^.+chr", "chr", wins$Id)
#
wins[1:3, ]
##           Id Win_number   Start     End Win_length      A      C      G      T
## 1 EH23a.chr1          0       0  999999    1000000 339510 159349 157789 343352
## 2 EH23a.chr1          1 1000000 1999999    1000000 346184 152100 152139 349577
## 3 EH23a.chr1          2 2000000 2999999    1000000 343278 155236 154692 346794
##      CG   CHG   CHH        CGs chrn       ATs gcnt ty1cnt ty3cnt    gcntsc
## 1 14003 19695 90544 0.07933312    1 0.9206669  156     24      0 1.0000000
## 2 13018 18441 88693 0.00000000    1 1.0000000  141     24      0 0.9038462
## 3 14100 19606 89229 0.08714562    1 0.9128544  123     18     12 0.7884615
##      gcol  chr
## 1 #FFFF00 chr1
## 2 #FFEC00 chr1
## 3 #FFD800 chr1
# Cannabinoid BLAST results.

cann_blst <- read.csv("cann_3dom_EH23a_tblastn.csv", header = FALSE)
colnames(cann_blst) <- c('qseqid','qlen','sseqid','slen','qstart',
                         'qend','sstart','send','evalue','bitscore',
                         'score','length','pident','nident','mismatch',
                         'positive','gapopen','gaps','ppos','sstrand',
                         'sseq')
#
cann_blst <- cann_blst[grep("SignalP|FAD|BBE", cann_blst$qseqid, invert = TRUE, value = F), ]
cann_blst$name <- sub("^.+_", "", cann_blst$qseqid)

# table(cann_blst$qlen)
cann_blst <- cann_blst[ cann_blst$gaps <= 0, ]
cann_blst <- cann_blst[ cann_blst$mismatch <= 10, ]

# write.table(cann_blst, file = "cann_3dom_EH23a_tblastn_filter.csv",
#             sep = ",", row.names = FALSE, col.names = FALSE, quote = FALSE)
# system("~/gits/hempy/bin/blast_to_gff.py cann_3dom_EH23a_tblastn_filter.csv")

cann_blst$chrn <- as.numeric(sub(".+chr", "", cann_blst$sseqid))

#table(cann_blst$qseqid)

# cann_blst[1:3, 1:10]
# cann_blst[1:3, 10:20]

knitr::kable(cann_blst[, c(1, 2, 3, 7, 13:15, 18, 23)], caption = "**Table 1. Cannabinoid synthase genes.**")
Table 1. Cannabinoid synthase genes.
qseqid qlen sseqid sstart pident nident mismatch gaps chrn
1 AB212829_THCAS1 545 EH23a.chr7 11346499 99.817 544 1 0 7
130 AB212830_THCAS2 545 EH23a.chr7 30706708 99.633 543 2 0 7
131 AB212830_THCAS2 545 EH23a.chr7 30774874 99.450 542 3 0 7
132 AB212830_THCAS2 545 EH23a.chr7 30824591 99.266 541 4 0 7
374 AB292683_CBDAS2 546 EH23a.chr7 12111396 98.716 538 7 0 7
504 AB292684_CBDAS3 546 EH23a.chr7 12111396 98.165 535 10 0 7
634 LY658671.1_CBCAS1 545 EH23a.chr7 30706708 99.817 544 1 0 7
635 LY658671.1_CBCAS1 545 EH23a.chr7 30774874 99.633 543 2 0 7
636 LY658671.1_CBCAS1 545 EH23a.chr7 30824591 99.450 542 3 0 7
#nrow(cann_blst)
blst <- read.csv("EH23a_blastn.csv", header = FALSE)
colnames(blst) <- c('qseqid','qlen','sseqid','slen','qstart',
                    'qend','sstart','send','evalue','bitscore',
                    'score','length','pident','nident','mismatch',
                    'positive','gapopen','gaps','ppos','sstrand',
                    #'staxids','sblastnames','salltitles',
                    'sseq')#[1:21]

#blst$sseqid <- factor(blst$sseqid, levels = c("EH23a.chr1", "EH23a.chr2", "EH23a.chr3", "EH23a.chr4", "EH23a.chr5", "EH23a.chr6", "EH23a.chr7", "EH23a.chr8", "EH23a.chr9", "EH23a.chrX"))
#blst <- blst[blst$qseqid == "CsatSD_centromere_370bp", ]
#table(blst$qseqid)
# blst[1:3, 1:10]

blst$chrn <- sub(".+chr", "", blst$sseqid)
blst$chrn[ blst$chrn == "X" ] <- 10
blst$chrn <- as.numeric(blst$chrn)

subt <- blst[grep("CsatSD_centromere_370bp", blst$qseqid), ]
cent <- blst[grep("CsatSD_centromere_237bp", blst$qseqid), ]

#blst
#knitr::kable(blst[1:3, c(1, 2, 3, 7, 13:15, 18, 22)], caption = "**Table X. Blast.**")
knitr::kable(subt[1:3, c(1, 2, 3, 7, 13:15, 18, 22)], caption = "**Table X. Blast.**")
Table X. Blast.
qseqid qlen sseqid sstart pident nident mismatch gaps chrn
CsatSD_centromere_370bp 370 EH23a.chrX 34848 100.00 370 0 0 10
CsatSD_centromere_370bp 370 EH23a.chrX 817 99.73 369 1 0 10
CsatSD_centromere_370bp 370 EH23a.chrX 6368 99.73 369 1 0 10

Ideo function

plot_ideo <- function() {
  suppressPackageStartupMessages(require(ggplot2))

  # marker_df <- data.frame(
  #   chrom = rep(names(map), times = lapply(map, length)),
  #   pos = unlist(map),
  #   marker = names(unlist(map))
  # )
  # marker_df$chromf <- factor( marker_df$chrom, levels = names(map) )
  # marker_df$chromn <- as.numeric( marker_df$chromf )

  # chr_df <- data.frame(
  #   start = unlist(lapply(map, min)),
  #   end = unlist(lapply(map, max))
  # )
  # chr_df$chr <- names(map)
  # chr_df$chrf <- factor( chr_df$chr, levels = names(map))
  # chr_df$chrn <- as.numeric( chr_df$chrf )

  chr_df <- data.frame(
    start = 1,
    end = nucs$Length,
    chr = nucs$Id
  )
  chr_df$chrn <- sub(".+chr", "", nucs$Id)
  chr_df$chrn[chr_df$chrn == "X"] <- 10
  chr_df$chrn <- as.numeric(chr_df$chrn)
  
  
  chrom_wid <- 0.02
  p <- ggplot2::ggplot()
  p <- p + ggplot2::geom_rect( data = chr_df, 
                               ggplot2::aes( xmin = chrn - chrom_wid,
                                  xmax = chrn + chrom_wid,
                                  #xmin = as.numeric(as.factor(chr)) - chrom_wid,
                                  #xmax = as.numeric(as.factor(chr)) + chrom_wid,
                                  ymin = end, ymax = start), 
                      #fill = "#C0C0C0",
                      fill = "#DCDCDC",
                      #fill = "#F5F5F5",
                      color = "#000000"
                      )
  #p <- p + scale_y_reverse(limits = c(max_gd, 0))
  
  #wins$Id
  thinw <- 0.28
  p <- p + ggplot2::geom_rect( 
    data = wins, 
    ggplot2::aes( 
      # xmin = chrn - CGs,
      # xmax = chrn + CGs,
      xmin = chrn - ATs * thinw,
      xmax = chrn + ATs * thinw,
      ymin = Start, 
      ymax = End),
    fill = wins$gcol,
    #fill = "#0000CD",
    #fill = "#A9A9A9",
    #fill = "#C0C0C0",
    #fill = "#DCDCDC",
    #fill = "#F5F5F5",
#                  color = "#000000"
    color = NA
    )
  #p

  cmwidth <- 0.4
  p <- p + ggplot2::geom_rect( 
    data = cann_blst, 
    ggplot2::aes(
      xmin = chrn - cmwidth,
      xmax = chrn + cmwidth,
      ymin = sstart, 
      ymax = send),
    #fill = "#0000CD",
    #fill = "#A9A9A9",
    fill = "#C0C0C0",
    #fill = "#DCDCDC",
    #fill = "#F5F5F5",
    #color = "#000000"
    color = "#228B22"
    #color = NA
    )
  #p
  

  p <- p + annotate(geom="text", x=7.5, y=11.3e6, label="THCAS1",
              color="#228B22", size = 3)
  p <- p + annotate(geom="text", x=7.5, y=12.1e6, label="CBDAS2",
              color="#0000FF", size = 3)
  p <- p + annotate(geom="text", x=7.5, y=30.7e6, label="CBCAS",
              color="#B22222", bg = "#ffffff", size = 3)
  #p
  
  stwidth <- 0.40
  p <- p + ggplot2::geom_rect( 
    data = subt, 
    ggplot2::aes(
      xmin = chrn - stwidth,
      xmax = chrn + stwidth,
      #xmax = chrn,
      ymin = sstart, 
      ymax = send),
    #fill = "#0000CD",
    #fill = "#A9A9A9",
    fill = "#C0C0C0",
    #fill = "#DCDCDC",
    #fill = "#F5F5F5",
    #color = "#000000"
    color = "#0000FF"
    #color = "#228B22"
    #color = NA
    )
  #p
  
  mwidth <- 0.6
  p <- p + ggplot2::geom_rect( 
    data = cent, 
    ggplot2::aes(
      xmin = chrn - mwidth,
#      xmax = chrn + mwidth,
      xmax = chrn - 0.2,
      ymin = sstart, 
      ymax = send),
    #fill = "#0000CD",
    #fill = "#A9A9A9",
    fill = "#C0C0C0",
    #fill = "#DCDCDC",
    #fill = "#F5F5F5",
    color = "#000000"
    #color = "#0000FF"
    #color = "#8A2BE2"
    #color = "#228B22"
    #color = NA
    )
  #p
  

#   mwidth <- 0.4
#   p <- p + ggplot2::geom_rect( 
#     data = Ty3, 
#     ggplot2::aes(
#       xmin = chrn - 0,
# #      xmin = chrn - mwidth,
#       xmax = chrn + mwidth,
# #      xmax = chrn,
#       ymin = V4, 
#       ymax = V5),
#     fill = "#0000FF",
#     color = NA
#     )
#   #p
  
  
  # marker_wid <- 0.1
  # marker_high <- 0.4
  # p <- p + ggplot2::geom_rect( data = marker_df, 
  #                     ggplot2::aes( xmin = chromn - marker_wid,
  #                          xmax = chromn + marker_wid, 
  #                          #xmin = as.numeric(as.factor(chrom)) - marker_wid,
  #                          #xmax = as.numeric(as.factor(chrom)) + marker_wid,
  #                          ymin = pos - marker_high, ymax = pos + marker_high),
  #                     fill = "#228B22", color = "#228B22"
  # )
  # 
  # p <- p + scale_y_reverse( breaks = seq(0, 2e3, by = 100) )
  #p <- p + ggplot2::scale_y_reverse( minor_breaks = seq(0, 2e3, by = 20), breaks = seq(0, 2e3, by = 100) )
  #p <- p + scale_x_continuous( breaks = as.numeric(as.factor(chr_df$chr) ) )
  p <- p + ggplot2::scale_x_continuous( breaks = chr_df$chrn, labels = chr_df$chr )
  # p <- p + scale_y_reverse(limits = c(max_gd, 0))
  # p <- p + scale_y_reverse(limits = c(0, max_gd))
  #p <- p + theme_bw() + theme( panel.grid.minor = element_blank() )
  p <- p + ggplot2::theme_bw() + 
    ggplot2::theme( panel.grid.minor.x = ggplot2::element_blank(), 
           axis.text.x = element_text(angle = 60, hjust = 1),
           panel.grid.major.y = ggplot2::element_line( linewidth = 0.4, color = "#C0C0C0", linetype = 1 ),
           panel.grid.minor.y = ggplot2::element_line( linewidth = 0.4, color = "#C0C0C0", linetype = 3 )
           #panel.grid.major.y = ggplot2::element_line( size = 0.4, color = "#C0C0C0", linetype = 1 ),
           #panel.grid.minor.y = ggplot2::element_line( size = 0.4, color = "#C0C0C0", linetype = 3 )
          )
#  p <- p + ggplot2::xlab("Chromosome")
  #p <- p + ylab("Distance (cM)")
  #p <- p + ggplot2::ylab("Location (cM)")
  p <- p + ggplot2::ylab("Position (bp)")
  #p
  

  #p

  #return( invisible( NULL ) )
  p
}
p1 <- plot_ideo()
p1

Chrom by genes

my_data <- readr::read_csv("gene_counts.csv", )
## Rows: 69 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): Sample
## dbl (11): chr1, chr2, chr3, chr4, chr5, chr6, chr7, chr8, chr9, chrX, chrY
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# names(my_data)[1] <- "Sample"
# my_data$Sample <- as.factor( my_data$Sample )
my_data
## # A tibble: 69 × 12
##    Sample  chr1  chr2  chr3  chr4  chr5  chr6  chr7  chr8  chr9  chrX  chrY
##    <chr>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 AH3Ma   4052  2763  2586  3189  2424  2535  2058  2892  2610  3689    NA
##  2 AH3Mb   4062  2807  2620  3224  2563  2604  2080  2967  2509    NA  2850
##  3 BCMa    4137  2878  2682  3256  2597  2599  2031  2953  2654  3792    NA
##  4 BCMb    4059  2703  2611  3098  2601  2615  1986  2905  2569    NA  2751
##  5 BOAXa   4097  2877  2680  3147  2444  2718  2015  2950  2612  3778    NA
##  6 BOAXb   4039  2816  2507  3159  2499  2586  1995  2914  2551  3670    NA
##  7 COFBa   4365  2681  2552  3035  2404  2457  1793  2840  2483  3625    NA
##  8 COFBb   4078  2960  2755  3300  2768  2750  2212  3058  2769  3944    NA
##  9 COSVa   4063  2834  2677  3130  2595  2580  2048  2940  2594  3722    NA
## 10 COSVb   4049  2846  2690  3175  2601  2645  2029  2977  2566  3801    NA
## # … with 59 more rows
library(tidyr)
data_long <- my_data %>%
  pivot_longer( cols = !Sample, names_to = "Chrom", values_to = "Count")
#data_long$Length <- data_long$Length / 1e6
gcount <- data_long
sum(is.na(data_long$Count))
## [1] 69
data_long <- data_long[!is.na(data_long$Count), ]
data_long
## # A tibble: 690 × 3
##    Sample Chrom Count
##    <chr>  <chr> <dbl>
##  1 AH3Ma  chr1   4052
##  2 AH3Ma  chr2   2763
##  3 AH3Ma  chr3   2586
##  4 AH3Ma  chr4   3189
##  5 AH3Ma  chr5   2424
##  6 AH3Ma  chr6   2535
##  7 AH3Ma  chr7   2058
##  8 AH3Ma  chr8   2892
##  9 AH3Ma  chr9   2610
## 10 AH3Ma  chrX   3689
## # … with 680 more rows
my_data <- readr::read_csv("chrom_lengths.csv")
## New names:
## Rows: 69 Columns: 12
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (1): ...1 dbl (11): chr1, chr2, chr3, chr4, chr5, chr6, chr7, chr8, chr9, chrX,
## chrY
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
names(my_data)[1] <- "Sample"
my_data$Sample <- as.factor( my_data$Sample )
my_data
## # A tibble: 69 × 12
##    Sample   chr1   chr2   chr3   chr4   chr5   chr6   chr7   chr8   chr9    chrX
##    <fct>   <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>   <dbl>
##  1 AH3Ma  6.57e7 7.65e7 8.20e7 8.16e7 7.65e7 7.63e7 6.64e7 5.41e7 6.60e7  8.42e7
##  2 AH3Mb  6.63e7 7.62e7 8.11e7 8.09e7 7.80e7 7.64e7 6.65e7 5.52e7 6.50e7 NA     
##  3 BCMa   6.51e7 7.58e7 8.62e7 8.12e7 7.67e7 7.53e7 6.55e7 5.48e7 6.49e7  8.33e7
##  4 BCMb   6.32e7 7.83e7 8.16e7 7.84e7 7.69e7 7.68e7 6.44e7 5.42e7 6.55e7 NA     
##  5 BOAXa  6.53e7 7.96e7 8.18e7 7.80e7 7.64e7 7.91e7 6.40e7 5.48e7 6.53e7  8.32e7
##  6 BOAXb  6.50e7 7.76e7 7.95e7 7.97e7 7.57e7 7.58e7 6.49e7 5.44e7 6.60e7  8.26e7
##  7 COFBa  7.21e7 7.85e7 8.44e7 8.21e7 7.76e7 7.97e7 6.37e7 5.57e7 6.55e7  8.57e7
##  8 COFBb  6.47e7 7.71e7 8.21e7 8.16e7 7.77e7 7.71e7 6.84e7 5.52e7 6.63e7  8.46e7
##  9 COSVa  6.76e7 8.34e7 8.63e7 8.52e7 8.09e7 7.93e7 6.99e7 5.53e7 6.81e7  8.65e7
## 10 COSVb  6.61e7 8.15e7 8.74e7 8.40e7 8.46e7 8.30e7 6.50e7 5.75e7 6.78e7  8.58e7
## # … with 59 more rows, and 1 more variable: chrY <dbl>
library(tidyr)
data_long <- my_data %>%
  pivot_longer( cols = !Sample, names_to = "Chrom", values_to = "Length")
data_long$Length <- data_long$Length / 1e6
data_long
## # A tibble: 759 × 3
##    Sample Chrom Length
##    <fct>  <chr>  <dbl>
##  1 AH3Ma  chr1    65.7
##  2 AH3Ma  chr2    76.5
##  3 AH3Ma  chr3    82.0
##  4 AH3Ma  chr4    81.6
##  5 AH3Ma  chr5    76.5
##  6 AH3Ma  chr6    76.3
##  7 AH3Ma  chr7    66.4
##  8 AH3Ma  chr8    54.1
##  9 AH3Ma  chr9    66.0
## 10 AH3Ma  chrX    84.2
## # … with 749 more rows
glength <- data_long
glength
## # A tibble: 759 × 3
##    Sample Chrom Length
##    <fct>  <chr>  <dbl>
##  1 AH3Ma  chr1    65.7
##  2 AH3Ma  chr2    76.5
##  3 AH3Ma  chr3    82.0
##  4 AH3Ma  chr4    81.6
##  5 AH3Ma  chr5    76.5
##  6 AH3Ma  chr6    76.3
##  7 AH3Ma  chr7    66.4
##  8 AH3Ma  chr8    54.1
##  9 AH3Ma  chr9    66.0
## 10 AH3Ma  chrX    84.2
## # … with 749 more rows
gcount
## # A tibble: 759 × 3
##    Sample Chrom Count
##    <chr>  <chr> <dbl>
##  1 AH3Ma  chr1   4052
##  2 AH3Ma  chr2   2763
##  3 AH3Ma  chr3   2586
##  4 AH3Ma  chr4   3189
##  5 AH3Ma  chr5   2424
##  6 AH3Ma  chr6   2535
##  7 AH3Ma  chr7   2058
##  8 AH3Ma  chr8   2892
##  9 AH3Ma  chr9   2610
## 10 AH3Ma  chrX   3689
## # … with 749 more rows
data_long <- cbind(glength, gcount$Count)
names(data_long)[4] <- "Count"
data_long[1:3, ]
##   Sample Chrom   Length Count
## 1  AH3Ma  chr1 65.67137  4052
## 2  AH3Ma  chr2 76.48150  2763
## 3  AH3Ma  chr3 81.99513  2586
my_pal <- RColorBrewer::brewer.pal(n=12, name = "Paired")
#my_pal[11] <- "#B15928"
my_pal[11] <- "#C71585"
my_pal <- paste(my_pal, "88", sep = "")

library(ggplot2)
# Basic scatter plot
p <- ggplot(data_long, aes(x=Length, y=Count, color=Chrom)) 
p <- p + geom_point(size=2)
#p <- p + geom_smooth(method=lm)
p <- p + geom_smooth(method=lm, se=FALSE, linewidth = 1)

p <- p + theme_bw()
p <- p + theme(legend.title = element_blank()) 
#p <- p + theme(legend.position = "left")
p <- p + theme(legend.position = "right")
#p <- p + theme( legend.spacing.y = unit(1.0, 'mm') )
## important additional element
#p <- p + guides(fill = guide_legend(byrow = TRUE))
p + theme(legend.spacing.y = unit(1.0, 'mm')) +
  guides(fill = guide_legend(byrow = TRUE))
## `geom_smooth()` using formula = 'y ~ x'

#p <- p + scale_color_brewer(palette="Dark2")
#p <- p + scale_color_brewer(palette="Paired")
p <- p + scale_color_manual(values=my_pal)
#p <- p + scale_color_brewer(palette="Set3")
p <- p + xlab("Chromosome length (Mbp)")
p <- p + ylab("Genes per chromosome")



p2 <- p
p2
## `geom_smooth()` using formula = 'y ~ x'

TE plot

#plot(wins$gcnt, wins$ty3cnt)
#plot(wins$gcnt, wins$ty1cnt)
wins[1:3, ]
##           Id Win_number   Start     End Win_length      A      C      G      T
## 1 EH23a.chr1          0       0  999999    1000000 339510 159349 157789 343352
## 2 EH23a.chr1          1 1000000 1999999    1000000 346184 152100 152139 349577
## 3 EH23a.chr1          2 2000000 2999999    1000000 343278 155236 154692 346794
##      CG   CHG   CHH        CGs chrn       ATs gcnt ty1cnt ty3cnt    gcntsc
## 1 14003 19695 90544 0.07933312    1 0.9206669  156     24      0 1.0000000
## 2 13018 18441 88693 0.00000000    1 1.0000000  141     24      0 0.9038462
## 3 14100 19606 89229 0.08714562    1 0.9128544  123     18     12 0.7884615
##      gcol  chr
## 1 #FFFF00 chr1
## 2 #FFEC00 chr1
## 3 #FFD800 chr1
my_pal <- RColorBrewer::brewer.pal(n=12, name = "Paired")
#my_pal[11] <- "#B15928"
my_pal[11] <- "#C71585"
my_pal <- paste(my_pal, "66", sep = "")


library(ggplot2)
# Basic scatter plot
#p <- ggplot(wins, aes(x=gcnt, y=ty3cnt, color=Id))
#p <- ggplot(wins, aes(x=gcnt, y=ty3cnt, color=chr))
p <- ggplot(wins, aes(x=gcnt, y=ty3cnt))
#p <- p + geom_point(size=2, color = "#C0C0C0")
#p <- p + geom_point(size=2, color = "#77889966")
#
p <- p + geom_point(size=2, color = "#70809044")
#p <- p + geom_point(size=2)

#p <- p + geom_smooth(method=lm)
p <- p + geom_smooth(method=lm, se=FALSE, linewidth = 2)

p <- p + geom_text(x=100, y=90, label="y = (-0.4)x + 56.7", size = 4)

p <- p + theme_bw()
p <- p + theme(legend.position = "none")
#p <- p + theme(legend.title = element_blank()) 
#p <- p + scale_color_brewer(palette="Dark2")
#p <- p + scale_color_brewer(palette="Paired")
p <- p + scale_color_manual(values=my_pal)
#p <- p + scale_color_brewer(palette="Set3")
p <- p + xlab("Genes per window")
p <- p + ylab("Ty3 elements per window")


p3 <- p
p3
## `geom_smooth()` using formula = 'y ~ x'

lm1 <- lm( wins$ty3cnt ~ wins$gcnt )
summary(lm1)
## 
## Call:
## lm(formula = wins$ty3cnt ~ wins$gcnt)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -56.700 -11.859   0.224  10.512  64.935 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 56.70020    1.09209   51.92   <2e-16 ***
## wins$gcnt   -0.40185    0.02219  -18.11   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17.43 on 742 degrees of freedom
## Multiple R-squared:  0.3065, Adjusted R-squared:  0.3056 
## F-statistic: 327.9 on 1 and 742 DF,  p-value: < 2.2e-16
lm2 <- lm( wins$ty1cnt ~ wins$gcnt )
summary(lm2)
## 
## Call:
## lm(formula = wins$ty1cnt ~ wins$gcnt)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -34.909 -13.025  -1.495   9.910  62.720 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 30.71174    1.02287  30.025  < 2e-16 ***
## wins$gcnt    0.06265    0.02078   3.014  0.00266 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16.32 on 742 degrees of freedom
## Multiple R-squared:  0.0121, Adjusted R-squared:  0.01077 
## F-statistic: 9.087 on 1 and 742 DF,  p-value: 0.002663

Graphic

#library(magick)
#ggdraw() +
#  cowplot::draw_image("https://i.stack.imgur.com/WDOo4.jpg?s=328&g=1") #+
#  draw_plot(my_plot)

Figure 1 (Composite plot)

library("ggpubr")

ggarrange(
  plotlist = list(p1, 
                  ggarrange(p3, p2, 
                            ncol = 2, nrow = 1, 
                            widths = c(2, 3),
                            labels = c("B", "C"))
                  ),
  labels = c("A", ""),
  ncol = 1, nrow = 2, 
  widths = 1, heights = c(2, 1))
**Figure 1. Genomic architecture of the** ***Cannabis sativa*** **genome.**

Figure 1. Genomic architecture of the Cannabis sativa genome.

Figure 1. Genomic architecture of the Cannabis sativa genome. (A) Ideogram of the ERBxHO40_23 haplotype A genome. Each chromosome is divided into 1 Mbp windows where each window’s width is proportional to the inverse abundance of the motif ‘CG’ and each window is colored according to gene density. Windows with high ‘CG’ abundance are narrow and windows with high gene counts are yellow. The plant ERBxHO40_23 resulted from a cross between strains ERB (chemotype III or CBD dominant) and HO40 (chemotype I or THC dominant). The genetically female plant HO40 (XX) was chemically masculinized to produce pollen flowers to facilitate the cross. Subtelomeric repeat motifs are marked with blue horizontal lines. Cannabinoid synthase genes are marked with green horizontal lines on chromosome seven. (B) The abundances of genes and Ty3 LTRs are inversely related. Each dot represents a 1 Mbp window from the ERBxHO40_23 haplotype A assembly, organized by the number of genes and Ty3 LTR elements contained in each window. Windows with high quantities of genes have low Ty3 LTR counts. The genes in ERBxHO40 are predominantly near the ends of each chromosome, while the central portion of each chromosome appears populated by Ty3 LTR elements. (C) Long chromosomes have more genes, however some chromosomes demonstrate exceptional gene content. Chromosome Y is the longest chromosome but includes a number of genes that is comparable to other chromosomes. Each dot represents a chromosome from phased and assembled haplotypes (n = 69; X = 65; Y = 4). Chromosome 1 is of average length but contains more genes than other chromosomes of a similar length. Chromosomes X and 8 also have more genes than chromosomes of similar length. Chromosome 7, where the cannabinoid synthases (CBCAS, CBDAS, THCAS) are found, has the lowest number of genes.